# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 14:07                
# +--------------------------
from threading import Thread
import re
import ast  # str转为list
import requests
from scrapy import Selector
from datetime import datetime  # 把字符串转为时间类型
from urllib import parse
import time

from csdn_spider.models import *

# Shared work queues (plain lists used as LIFO stacks by the worker threads).
topic_list_urls = []  # topic list-page URLs waiting to be parsed
topic_list = []  # individual topic (post) URLs waiting to be parsed
author_list = []  # author profile URLs (currently unused placeholder)
# CSDN forum domain to crawl; relative links are joined against this base
domain = 'https://bbs.csdn.net/'


# Download the left-menu JS and pull the forum node list out of it with a regex.
def get_nodes_json():
    """Fetch CSDN's left_menu.js and return the ``forumNodes`` array as a Python list.

    Returns an empty list when the expected ``forumNodes: [...]`` fragment
    is not present in the downloaded script.
    """
    js_source = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    match = re.search("forumNodes: (.*])", js_source)
    if not match:
        return []
    # JS 'null' is not valid Python literal syntax; map it to 'None' first,
    # then parse the array safely with ast.literal_eval (no eval of code).
    return ast.literal_eval(match.group(1).replace('null', 'None'))


# Accumulator for every forum url found in the node tree
# (filled in place by process_nodes_list below).
url_list = []


# Recursively walk the node tree and collect every non-empty 'url' into url_list.
def process_nodes_list(nodes_list):
    """Append each node's url (and, recursively, its children's urls) to the
    module-level ``url_list``.

    Nodes without a 'url' key are skipped entirely — including their
    children — mirroring the original traversal order exactly.
    """
    for node in nodes_list:
        if "url" not in node:
            continue
        if node['url']:
            url_list.append(node['url'])
        if 'children' in node:
            process_nodes_list(node['children'])


# Collect the top-level urls; they are later excluded because a level-1 url
# is only an aggregate page of the forums underneath it.
def get_level1_list(nodes_list):
    """Return the non-empty 'url' values of the top-level nodes only."""
    return [node['url'] for node in nodes_list if 'url' in node and node['url']]


# Build the final list of absolute list-page urls to crawl.
def get_last_urls():
    """Return every forum list-page URL to scrape, as absolute addresses.

    Each leaf forum url is expanded to three variants: the default view
    (unresolved posts), '/recommend' (featured) and '/closed' (resolved).
    Level-1 urls are dropped because they only aggregate their children.
    """
    nodes_list = get_nodes_json()
    # Fills the module-level url_list with every url in the tree.
    process_nodes_list(nodes_list)
    level1_url = get_level1_list(nodes_list)
    # Keep only the leaf urls (present in url_list but not a level-1 url).
    last_urls = [url for url in url_list if url not in level1_url]
    all_urls = []
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        # BUG FIX: was `url + 'closed'` (missing '/'), which generated
        # '/forums/xxxclosed' instead of the intended '/forums/xxx/closed'.
        all_urls.append(parse.urljoin(domain, url + '/closed'))

    return all_urls


class ParseTopicAuthorThread(Thread):
    # Placeholder: the author-profile scraping thread is not implemented yet
    # (see the commented-out parse_author(...) call in ParseTopicListThread).
    pass


class ParseTopicDetailThread(Thread):
    """Worker thread: pops topic URLs off ``topic_list``, scrapes the topic
    body, its stats and every reply, and persists them via the peewee models.

    Runs forever; sleeps 1s whenever the queue is momentarily empty.
    """

    def run(self):
        while 1:
            try:
                url = topic_list.pop()
            except IndexError:
                # Queue is empty for now -- back off and retry.
                time.sleep(1)
                continue
            print('开始获取帖子: {} '.format(url))

            # The numeric topic id is the last path segment of the url.
            topic_id = url.split('/')[-1]
            res_text = requests.get(url).text
            sel = Selector(text=res_text)
            # Every floor (original post + each reply) is a div whose id starts with 'post-'.
            all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")
            topic_item = all_divs[0]  # the first div is the topic itself
            content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # post body html
            praised_nums = topic_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # praise count
            jtl = 0  # close/accept rate ("结帖率"); stays 0 when the page doesn't show one
            if topic_item.xpath(".//div[@class='close_topic']/text()").extract():
                jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]
                # BUG FIX: the original pattern "(\d+\.?\d+)%" required at least two
                # digits, so single-digit rates like "5%" never matched. Raw string
                # also avoids the invalid-escape DeprecationWarning.
                jtl_match = re.search(r"(\d+(?:\.\d+)?)%", jtl_str)
                if jtl_match:
                    jtl = jtl_match.group(1)  # NOTE(review): stored as str; model presumably coerces on save

            existed_topics = Topic.select().where(Topic.id == topic_id)
            # The list page created the Topic row; fill in the fields only
            # available on the detail page.
            if existed_topics:
                topic = existed_topics[0]
                topic.content = content
                topic.jtl = jtl
                topic.praised_nums = praised_nums
                topic.save()

            # All remaining divs are replies.
            for answer_item in all_divs[1:]:
                answer = Answer()
                answer.topic_id = topic_id  # id of the topic this reply belongs to

                author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
                answer.author = author_info.split('/')[-1]  # replier's user id

                create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
                create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
                answer.create_time = create_time  # reply timestamp

                content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # reply body html
                answer.content = content

                praised_nums = answer_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # praise count
                answer.praised_nums = int(praised_nums)

                answer.save()

            # If the replies span multiple pages, queue the next page back
            # onto topic_list so some worker picks it up.
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                topic_list.append(next_url)


class ParseTopicListThread(Thread):
    """Worker thread: pops list-page URLs off ``topic_list_urls``, extracts
    every topic row, upserts a Topic record, and feeds each topic's detail
    URL into ``topic_list`` for ParseTopicDetailThread to consume.

    Runs forever; sleeps 1s whenever the queue is momentarily empty.
    """

    def run(self):
        while 1:
            try:
                url = topic_list_urls.pop()
            except IndexError:
                # Queue is empty for now -- back off and retry.
                time.sleep(1)
                continue
            print('开始获取帖子列表页: {} '.format(url))

            res_text = requests.get(url).text
            sel = Selector(text=res_text)
            # Each topic is one <tr> of the forum table.
            all_trs = sel.xpath("//table[@class='forums_tab_table']/tbody//tr")
            for tr in all_trs:
                topic = Topic()

                if tr.xpath(".//td[1]/span/text()").extract():
                    status = tr.xpath(".//td[1]/span/text()").extract()[0]  # status: "未结" / "已结" / "满意"
                    topic.status = status

                if tr.xpath(".//td[2]/em/text()").extract():
                    score = tr.xpath(".//td[2]/em/text()").extract()[0]  # bounty points
                    topic.score = int(score)

                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract():
                    topic_url = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract()[0]  # title link (relative)
                    topic.id = int(topic_url.split('/')[-1])  # topic id is the last path segment
                    topic_url = parse.urljoin(domain, topic_url)  # make absolute for the detail crawler

                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract():
                    topic_title = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract()[0]  # title text
                    topic.title = topic_title

                # BUG FIX: the guard was "//td[4]/a/text()" (no leading dot),
                # which matched anywhere in the document instead of inside
                # this row -- it must be row-relative like the other guards.
                if tr.xpath(".//td[4]/a/text()").extract():
                    author_url = tr.xpath(".//td[4]/a/@href").extract()[0]  # author link (relative)
                    author_id = author_url.split('/')[-1]  # author id
                    author_url = parse.urljoin(domain, author_url)  # absolute author link
                    topic.author = author_id
                    # Author-profile parsing is not wired up yet.
                    # parse_author(author_url)

                if tr.xpath(".//td[4]/em/text()").extract():
                    create_time = tr.xpath(".//td[4]/em/text()").extract()[0]  # creation time string
                    create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')  # to datetime
                    topic.create_time = create_time

                if tr.xpath(".//td[5]/span/text()").extract():
                    answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]  # "replies/views"
                    answer_nums = answer_info.split('/')[0]  # reply count
                    click_nums = answer_info.split('/')[1]  # view count
                    topic.click_nums = int(click_nums)
                    topic.answer_nums = int(answer_nums)

                if tr.xpath(".//td[6]/em/text()").extract():
                    last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]  # last reply time string
                    last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')  # to datetime
                    topic.last_answer_time = last_time

                try:
                    # Upsert: update the row if it exists, otherwise insert.
                    existed_topics = Topic.select().where(Topic.id == topic.id)
                    if existed_topics:
                        topic.save()
                    else:
                        topic.save(force_insert=True)
                except Exception as e:
                    # Best-effort persistence: log instead of silently
                    # swallowing, but keep the crawl alive.
                    print('保存topic失败: {}'.format(e))

                # Hand the topic's detail page to the detail thread.
                topic_list.append(topic_url)

            # If there is a next page, push it back onto the list-page queue.
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                topic_list_urls.append(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        topic_list_urls.append(url)

    topic_list_thread = ParseTopicListThread()
    topic_detail_thread = ParseTopicDetailThread()

    topic_list_thread.start()
    topic_detail_thread.start()
